* Reset the session: clear memory, turn off output paging, and close a log
* if one happens to be open (capture swallows the error when none is)
clear all
set more off
capture log close

* Working directory; file names given without a path further down
* (e.g. the saved input-* files) are resolved against it
*@cd "YOUR WORKING DIRECTORY HERE"
cd "P:\NCERDC-Taylor\data\constructed\"

* Root folder of the raw NCERDC files, referenced below as ${ncerdc_path}
*@glo ncerdc_path "YOUR NCERDC DIRECTORY HERE"
global ncerdc_path "P:\NCERDC-Taylor\data\raw\"

*===============================================================================
*	Student test score data
*===============================================================================
*-------------------------------------------------------------------------------
*	2003-2013
*-------------------------------------------------------------------------------

* Build the 2003-2013 end-of-grade score file: stack the grade 3-8 files of
* every year, reshape to one row per score, and remove duplicate records.
* The result is saved to the tempfile `eog03to13'.
clear

* placeholders on the empty dataset; rows that arrive with these missing are
* stamped with the year / grade of the file just read
gen year = .
gen test_grade = .

* this loop reads in NCERDC data files
* the separator after ${ncerdc_path} is written explicitly, as in every other
* section of this file, so the paths do not depend on the global ending in "\"
forv y = 3/13 {

	forv g = 3/8 {

		* 2003-2009 and 2010-2013 follow different folder / file name patterns
		if inrange( `y' ,  3 ,  9 ) append using "${ncerdc_path}\Student\End of Grade\eog`g'pub0`y'"
		if inrange( `y' , 10 , 13 ) append using "${ncerdc_path}\Student\End of Grade\20`y'\eog`g'pub`y'"
		
		replace test_grade = `g' if mi( test_grade )
		
	}
	
	replace year = 2000 + `y' if mi( year )
	
	* NOTE(review): presumably mathpc/readpc are strings through 2012 and must
	* be numeric before the 2013 files can be appended -- confirm
	if `y' == 12 destring mathpc readpc , replace
	
}

* records without a student id are dropped; the expected count is asserted
* so that a change in the raw files stops the script
count if mi( mastid )
assert r(N) == 48
drop if mi( mastid )

keep mastid year administ test_grade mathscal readscal schlcode lea 

* reshape to individual test score observations
* (mathscal / readscal become scal, with subj = "math" / "read")
gen i = _n
reshape long @scal , i( i ) j( subj ) string
drop i
drop if mi( scal )

* small number missing on "administ"
* 0.15% from 2003-2012, plus all records in 2013
* if missing assume = 0
replace administ = "0" if mi( administ ) | administ == "*"
destring administ , replace

** duplicates
* complete duplicates
duplicates drop
* ~0.05% with >= 2 different scores, dropping these cases
* (every record of an affected student-year-subject-administration is dropped)
duplicates tag mastid year subj administ , gen( tag )
count if tag != 0
assert r(N) == 17655
drop if tag != 0
drop tag

tempfile eog03to13 
save `eog03to13' , replace

*-------------------------------------------------------------------------------
*	2014-2016
*-------------------------------------------------------------------------------

* Build the 2014-2016 score rows from the "curtest" files, ending with the
* same variables as the 2003-2013 file (mastid year subj test_grade scal
* schlcode lea administ). The result stays in memory.
clear

* this loop reads in NCERDC data files
forvalues yy = 14/16 {
	append using "${ncerdc_path}\Student\Tests\curtest_pub20`yy'"
}

* records without a student id are dropped; the expected count is asserted
count if missing( mastid )
assert r(N) == 701
drop if missing( mastid )

* lower-case all variable names (renvars is a user-written command)
renvars * , lower

destring year , replace

* keep test ids starting with MA or RD, except RD3A / RD3R / RD3B
keep if regexm( test_id , "^MA|^RD" )
drop if inlist( test_id , "RD3A" , "RD3R" , "RD3B" )

* every remaining test id starts with either MA or RD
generate subj = cond( regexm( test_id , "^MA" ) , "math" , "read" )

* test grade = test_id with the characters M, A, R, D and blanks removed
destring test_id , generate( test_grade ) ignore( "MA RD" )

* scale score; the characters N, U, L are stripped before conversion
destring score , generate( scal ) ignore( NULL )

drop if missing( scal )

keep mastid year subj test_grade scal schlcode lea

* every record in these years is coded as administration 0
generate administ = 0

** duplicates
* complete duplicates
duplicates drop
* ~1.5% with >= 2 different scores, dropping these cases
duplicates tag mastid subj year , generate( dup )
count if dup != 0 & !missing( dup )
assert r(N) == 121269
drop if dup != 0 & !missing( dup )
drop dup

*-------------------------------------------------------------------------------
*	Combine 2003-2013 with 2014-2016
*-------------------------------------------------------------------------------

* stack the 2003-2013 rows under the 2014-2016 rows currently in memory
append using `eog03to13'

*-------------------------------------------------------------------------------
*	Reshape wide
*-------------------------------------------------------------------------------

* one row per student-year-subject; the administration number becomes a
* suffix (scal0, scal1, ...)
reshape wide scal test_grade lea schlcode , i( mastid year subj ) j( administ )

* an initial (administration 0) score is required
drop if missing( scal0 )

* grade / district / school are taken from the initial administration
rename schlcode0 schlcode
rename lea0 lea
rename test_grade0 test_grade

*-------------------------------------------------------------------------------
*	Standardize scores
*-------------------------------------------------------------------------------

* standardize using the initial score distribution
* (mean and sd of scal0 within subject x year x grade)
egen cell_mean = mean( scal0 ) , by( subj year test_grade )
egen cell_sd   = sd( scal0 )   , by( subj year test_grade )

* the administration-1 score is scaled with the same initial-score moments
forvalues a = 0/1 {
	generate std`a' = ( scal`a' - cell_mean ) / cell_sd
}

rename scal0 scal
rename std0 std

drop cell_mean cell_sd

*-------------------------------------------------------------------------------
*	Proficiency levels and RD centered scores
*
*	The cutscores are documented in "proficiency-cut-scores.dta" which
*	we created for this project, starting with the test score data and 
*	confirmed by historical documents from the state website.
*
*	Beginning in 2014, there were 5 levels with the original "level 2" divided 
*	into two levels to make 5 total. Thus, for our purposes the relevant
*	passing/failing cut shifted to 3 v. 4 in 2014. To keep things consistent, 
*	in the "proficiency-cut-scores.dta" file we convert 2014-2016 back to the
*	original 4 levels. However, the file also contains the "5 level version" 
*	for those three years.
*-------------------------------------------------------------------------------

* attach the cut scores of each year x grade x subject cell; assert( match )
* stops the script unless every row on both sides finds a partner
merge m:1 year test_grade subj using proficiency-cut-scores , assert( match ) nogenerate

* ---- initial score ----
* level = the band [l#_min, l#_max] that contains the score
generate plvl = .
forvalues lvl = 1/4 {
	replace plvl = `lvl' if inrange( scal , l`lvl'_min , l`lvl'_max )
}
* scores outside every band; the expected count is asserted
count if missing( plvl )
assert r(N) == 40
* scores below the level-1 minimum are assigned level 1
replace plvl = 1 if scal < l1_min

* score centered at the level-3 minimum:
*   c23_scal      only for levels 2 and 3
*   c23_full_scal for every row
*   c23_bel       1 if c23_scal is negative, 0 otherwise
generate c23_scal = scal - l3_min if plvl == 2 | plvl == 3
generate c23_full_scal = scal - l3_min
generate c23_bel = ( c23_scal < 0 ) if !missing( c23_scal )

* ---- administration-1 score (scal1), same construction ----
generate plvl1 = .
forvalues lvl = 1/4 {
	replace plvl1 = `lvl' if inrange( scal1 , l`lvl'_min , l`lvl'_max )
}
* only rows that have an administration-1 score are counted here
count if missing( plvl1 ) & !missing( scal1 )
assert r(N) == 213
replace plvl1 = 1 if scal1 < l1_min

generate c23_scal1 = scal1 - l3_min if plvl1 == 2 | plvl1 == 3
generate c23_full_scal1 = scal1 - l3_min
generate c23_bel1 = ( c23_scal1 < 0 ) if !missing( c23_scal1 )

* the cut-score columns are no longer needed
drop *min* *max*

*-------------------------------------------------------------------------------
*	Future / lagged outcomes
*-------------------------------------------------------------------------------

* panel unit = student x subject, time = year, so that L. / F. refer to the
* same student and subject in the adjacent calendar year
egen idsubj = group( mastid subj )

tsset idsubj year

* standardized score one year back, one and two years ahead
generate std_tm1 = L1.std
generate std_tp1 = F1.std
generate std_tp2 = F2.std
* scale score one year ahead
generate scal_tp1 = F1.scal

* same tested grade in the following year
* NOTE(review): F1.test_grade is missing when there is no record in year + 1
* and "test_grade == ." evaluates to 0, so such rows are coded 0 rather than
* missing; a missing c23_scal also satisfies "> -4" (missing sorts above every
* number). Confirm both are intended.
generate retained_tp1 = ( test_grade == F1.test_grade ) if c23_scal > -4

drop idsubj

*-------------------------------------------------------------------------------
*	Reshape, move reading scores
*-------------------------------------------------------------------------------

* Turn the student-year-subject file into a student-year file: math columns
* keep their names, selected reading columns get the suffix _read.
preserve

keep if subj == "read"

* reading columns carried over to the student-year file
local readvars std_tm1 std std_tp1 c23_bel c23_scal c23_full_scal

keep mastid year `readvars'

foreach v of local readvars {
	rename `v' `v'_read
}

tempfile reading
save `reading' , replace

restore

keep if subj == "math"

* no keep() option: student-years found only in the reading file are retained
merge 1:1 mastid year using `reading' , nogenerate

*-------------------------------------------------------------------------------
*	Finish up
*-------------------------------------------------------------------------------

* final column set, in output order
local outvars mastid year test_grade lea schlcode ///
		 std scal plvl ///
		 c23_scal c23_full_scal c23_bel  ///
		 std_tm1 std_tp1 std_tp2 scal_tp1 ///
		 std1 ///
		 c23_scal1 c23_full_scal1 c23_bel1 ///
		 std_read ///
		 c23_scal_read c23_full_scal_read c23_bel_read ///
		 std_tm1_read std_tp1_read ///
		 retained_tp1
keep `outvars'
order `outvars'

save input-test-scores , replace

*===============================================================================
*	Non-test score student characteristics data
*===============================================================================
*-------------------------------------------------------------------------------
*	ACCDEMO data
*-------------------------------------------------------------------------------

* this loop reads in NCERDC data files
* Read each year's ACCDEMO file (2006-2016), harmonize it, store it in a
* tempfile, then stack all years.
* this loop reads in NCERDC data files
forvalues y = 6/16 {

	* the file name pattern is not the same in every year
	if inrange( `y' , 6 , 9 ) {
		use "${ncerdc_path}\Student\MBuild\ACCDEMO\accdemopub200`y'" , clear
	}
	else if inrange( `y' , 13 , 14 ) {
		use "${ncerdc_path}\Student\MBuild\ACCDEMO\accdemo_pub20`y'" , clear
	}
	else {
		use "${ncerdc_path}\Student\MBuild\ACCDEMO\accdemopub20`y'"  , clear
	}

	generate year = 2000 + `y'

	drop if missing( mastid )

	* 1 / 0 for sex F / M, missing for any other value
	generate female = ( sex == "F" ) if inlist( sex , "F" , "M" )

	* 2009-2012: convert the attendance / discipline counts to numeric,
	* stripping the characters N, U, L first
	if inrange( `y' , 9 , 12 ) {

		destring daysabs daysmem exc_abs unexc_abs days_in_susp days_out_susp ///
				 times_tardy times_in_susp times_out_susp ///
			, replace ignore( NULL )

	}

	* 2009-2016: map the one-letter and four-letter ethnicity codes to the
	* numeric codes 1-6, blank out N / NULL, then convert to numeric
	if inrange( `y' , 9 , 16 ) {

		replace ethnic = "1" if inlist( ethnic , "I" , "AMIN" )
		replace ethnic = "5" if inlist( ethnic , "W" , "WHTE" )
		replace ethnic = "4" if inlist( ethnic , "B" , "BLCK" )
		replace ethnic = "2" if inlist( ethnic , "A" , "ASIA" , "P" )
		replace ethnic = "3" if inlist( ethnic , "H" , "HISP" )
		replace ethnic = "6" if inlist( ethnic , "M" , "MULT" )
		replace ethnic = ""  if inlist( ethnic , "N" , "NULL" )
		destring ethnic , replace

	}

	* columns kept: a common core, plus daysabs through 2015, plus the
	* detailed absence / suspension counts through 2012
	local keepvars mastid year grade female ethnic daysmem bdate
	if `y' <= 15 {
		local keepvars `keepvars' daysabs
	}
	if `y' <= 12 {
		local keepvars `keepvars' exc_abs unexc_abs days_in_susp days_out_susp ///
			times_tardy times_in_susp times_out_susp
	}
	keep `keepvars'

	tempfile accdemo`y'
	save `accdemo`y'' , replace

}

* stack the yearly files
use `accdemo6' , clear
forvalues y = 7/16 {
	append using `accdemo`y''
}

* grade to numeric: "0K" -> 0, "PK" -> -1; the characters N and U are
* stripped from any other value before conversion
replace grade = "0" if grade == "0K"
replace grade = "-1" if grade == "PK"
destring grade , replace ignore( NU )

** duplicates
* Reduce the stacked ACCDEMO file to one row per mastid-year and store it in
* the tempfile `accdemo'.
* step 1: rows identical on every variable
duplicates drop
* still some duplicates
* some because only "daysmem" differs
* assume record with larger number of daysmem is more accurate, likely recorded later
* (egen max() ignores missing values, so a row with missing daysmem is dropped
*  when another row of the same mastid-year has a value; if all are missing,
*  max is missing too and ". == ." keeps them)
egen max = max( daysmem ) , by( mastid year )
keep if daysmem == max
drop max 
* some because of "daysabs", keeping larger number on the assumption that it is cumulative reporting
egen max = max( daysabs ) , by( mastid year )
keep if daysabs == max
drop max
* trivial number remain, < 0.05%, randomly keeping one
* (tag is created here but not used before it is dropped; the seed is fixed,
*  and the draw assigned to each row depends on the row order at this point)
duplicates tag mastid year , gen( tag )
set seed 4961232
gen rand = uniform()
  sort mastid year   rand
* keep the row that sorts last on rand within each mastid-year
bysort mastid year ( rand ) : keep if _n == _N
drop rand tag

* stop if mastid-year does not uniquely identify the rows
isid mastid year

tempfile accdemo 
save `accdemo' , replace

*-------------------------------------------------------------------------------
*	MBuild/PCAudit data
*-------------------------------------------------------------------------------

* Stack the MBuild (2002-2012) and PCAudit (2013-2016) files and harmonize
* the student characteristics across years.
clear

gen year = .

* these two loops read in NCERDC data files
forv y = 2/12 {

	if inrange( `y' ,  2 ,  9 ) append using "${ncerdc_path}\Student\MBuild\mb_200`y'_pub"
	if inrange( `y' , 10 , 12 ) append using "${ncerdc_path}\Student\MBuild\mb_20`y'_pub"
		
	* rows that arrive without a year get the year of the file just read
	replace year = 2000 + `y' if mi( year )
	
}

* converted before the PCAudit files are appended; the characters N, U, L
* are stripped first
destring daysmem , replace ignore( NULL )

forv y = 13/16 {

	append using "${ncerdc_path}\Student\MBuild\pcaudit_pub20`y'"
		
	replace year = 2000 + `y' if mi( year )
	
}

drop if mi( mastid )

* the characters G, R, N, U, S and blanks are stripped before conversion
destring grade , replace ignore( GR NU SS )

* 1 / 0 for sex F / M, missing for any other value
gen female = sex == "F" if inlist( sex , "F" , "M" )

* ethnicity: fill from the alternative column, then map the letter codes to
* 1-6; "N" and "NULL" become missing, matching the ACCDEMO recode in this file
replace ethnic = ethnicity if mi( ethnic )
replace ethnic="1" if ethnic=="I" | ethnic=="AMIN"
replace ethnic="5" if ethnic=="W" | ethnic=="WHTE"
replace ethnic="4" if ethnic=="B" | ethnic=="BLCK"
replace ethnic="2" if ethnic=="A" | ethnic=="ASIA"
replace ethnic="3" if ethnic=="H" | ethnic=="HISP"
replace ethnic="6" if ethnic=="M" | ethnic=="MULT"
replace ethnic="2" if ethnic=="P"
replace ethnic="" if ethnic=="N" | ethnic=="NULL"
destring ethnic, replace

replace bdate = birthdt if mi( bdate )

* eds: Y/N recoded to 1/0; in 2006-2007 it is derived from frl instead
* (F, R, T -> 1, anything else including missing -> 0); missing through 2005
replace eds = "1" if eds == "Y"
replace eds = "0" if eds == "N"
replace eds = "1" if  inlist( frl , "F" , "R" , "T" ) & inrange( year , 2006 , 2007 )
replace eds = "0" if !inlist( frl , "F" , "R" , "T" ) & inrange( year , 2006 , 2007 )
replace eds = ""  if year <= 2005
destring eds , replace

* swd: 2009-2016 from swd itself (N, U -> 0; Y -> 1); 2006-2008 from ec
* (NULL, NU, 01 -> 0; anything else -> 1); missing through 2005
replace swd = "0" if inlist( swd , "N" , "U" ) & inrange( year , 2009 , 2016 )
replace swd = "1" if swd == "Y" 			   & inrange( year , 2009 , 2016 )
replace swd = "0" if  inlist( ec , "NULL" , "NU" , "01" ) & inrange( year , 2006 , 2008 )
replace swd = "1" if !inlist( ec , "NULL" , "NU" , "01" ) & inrange( year , 2006 , 2008 )
replace swd = "" if year <= 2005
destring swd , replace

* lep: 2006-2016 from lep / lep_current (Y, U -> 1; N -> 0); 2004-2005 from
* LEPflag (1-8 -> 1; anything else including missing -> 0); missing through 2003
replace lep = lep_current if mi( lep )
replace lep = "1" if inlist( lep , "Y" , "U" ) & inrange( year , 2006 , 2016 )
replace lep = "0" if lep == "N" 			   & inrange( year , 2006 , 2016 )
replace lep = "1" if  inrange( LEPflag , 1 , 8 ) & inrange( year , 2004 , 2005 )
replace lep = "0" if !inrange( LEPflag , 1 , 8 ) & inrange( year , 2004 , 2005 )
replace lep = "" if year <= 2003
destring lep , replace

replace daysmem = daysNIV if mi( daysmem )

keep mastid year grade female ethnic bdate eds swd lep daysabs daysmem

** duplicates
* Reduce the stacked MBuild / PCAudit file to one row per mastid-year.
* step 1: rows identical on every variable
duplicates drop
* still some duplicates
* assume record with larger number of daysmem is more recent
* (egen max() ignores missing values; if every daysmem of a mastid-year is
*  missing, max is missing too and ". == ." keeps those rows)
egen max = max( daysmem ) , by( mastid year )
keep if daysmem == max
drop max 
* some because of daysabs, keeping larger number on the assumption that it is cumulative reporting
egen max = max( daysabs ) , by( mastid year )
keep if daysabs == max
drop max
* trivial number remain, < 0.05%, randomly keeping one
* (tag is created here but not used before it is dropped; the seed is fixed,
*  and the draw assigned to each row depends on the row order at this point)
duplicates tag mastid year , gen( tag )
set seed 4961232
gen rand = uniform()
  sort mastid year   rand
* keep the row that sorts last on rand within each mastid-year
bysort mastid year ( rand ) : keep if _n == _N
drop rand tag

*-------------------------------------------------------------------------------
*	Combine
*-------------------------------------------------------------------------------

* add the ACCDEMO rows; with the update option, values that are missing in
* the data in memory are filled from ACCDEMO, while non-missing values in
* memory are left as they are
merge 1:1 mastid year using `accdemo' , update nogenerate

* ethnic code 5 is the value the recodes in this file assign to W / WHTE
generate white = ( ethnic == 5 ) if !missing( ethnic )

* lagged and lead days absent (adjacent calendar years of the same student)
tsset mastid year
generate daysabs_tm1 = L1.daysabs
generate daysabs_tp1 = F1.daysabs

* final column set, in output order
local outvars mastid year female white eds swd lep daysabs_tm1 daysabs daysabs_tp1 daysmem
keep `outvars'
order `outvars'

save input-student-chars , replace

*===============================================================================
*	Student-teacher links
*===============================================================================

* Build one math teacher per student-year from the 2007-2013 course
* membership files; students whose teacher cannot be resolved are dropped.
clear

gen year = .

* this loop reads in NCERDC data files
forv y = 7/13 {

	loc year = 2000 + `y'
	
	append using "${ncerdc_path}\Student\Course Membership\crs_memb_pub`year'"
	
	* rows that arrive without a year get the year of the file just read
	replace year = `year' if mi( year )
	
}

drop if mi( mastid ) | mi( teachid ) 

keep if inlist( grade , "03" , "04" , "05" , "06" , "07" , "08" )

* primary math courses ( or self-contained elementary classes )
keep if inlist( substr( localcourse , 1 , 4 ) , "0000" , "2001" , "2003" , "2020" , "2023" )

* 93% have only one math teacher
* (nvals() is an egen function from the user-written egenmore package)
egen ntids1 = nvals( teachid ) , by( mastid year )

* if more than one teacher, and both "0" and "2" course(s), keep "2" course(s)
gen cc_1 = real( substr( localcourse , 1 , 1 ) )
egen nv_cc_1 = nvals( cc_1 ) , by( mastid year ) 
drop if ntids1 >= 2 & nv_cc_1 == 2 & cc_1 == 0

* if more than one teacher, and differ by spring and fall data, keep fall data
* The teacher count is re-taken after the drop above and is the one tested
* here, so the rule applies only to students who still have several teachers.
* For students already down to one teacher the rule could only remove rows of
* that same teacher, so the saved file is the same either way.
* NOTE(review): the rule drops collection_code "FDS"; confirm that "FDS"
* marks the spring collection, as the comment above implies.
egen ntids2 = nvals( teachid ) , by( mastid year )
egen nv_collection_code = nvals( collection_code ) , by( mastid year )
drop if ntids2 >= 2 & nv_collection_code == 2 & collection_code == "FDS" 

* if still more than 1, drop entire student, less than 1% of students
egen ntids3 = nvals( teachid ) , by( mastid year )
drop if ntids3 > 1

* collapse the remaining course rows to one row per student-year
keep mastid year teachid
duplicates drop

* stop if mastid-year does not uniquely identify the rows
isid mastid year

save input-links , replace

*===============================================================================
*	Combine into analysis file, construct additional vars
*===============================================================================

* start from the test score file and attach student characteristics and
* teacher links; rows found only in the attached files are not kept
use input-test-scores , clear
foreach f in input-student-chars input-links {
	merge 1:1 mastid year using `f' , keep( master match ) nogenerate
}

* indicator for the years 2009-2012
generate retest = inrange( year , 2009 , 2012 )

* cell identifiers: grade x year, and grade x year x district x school
egen gby   = group( test_grade year )
egen gbybs = group( test_grade year lea schlcode )

*-------------------------------------------------------------------------------
*	Value-added (using vam.ado from Chetty, Friedman, and Rockoff 2014)
*-------------------------------------------------------------------------------

* The covariate edits below are made on a preserved copy and undone by
* restore; only the file written by vam is carried forward.
preserve

* covariates: add a missing-value indicator, then set missing values to 0
foreach v of varlist female white lep eds swd {
	generate mi_`v' = missing( `v' )
	replace `v' = 0 if missing( `v' )
}

* class identifier passed to vam: teacher x year
egen tby = group( teachid year )

* controls: prior-year score, its square, grade and year, fully interacted,
* plus the covariates and their missing-value indicators
local controls c.std_tm1##c.std_tm1##i.test_grade##i.year ///
	  female mi_female white mi_white lep mi_lep eds mi_eds swd mi_swd

* vam is a user-written command (see the section header); consult its help
* file for the meaning of the individual options
vam std , teacher( teachid ) year( year ) class( tby ) ///
	controls( `controls' ) tfx_resid( teachid ) ///
	driftlimit( 6 ) ///
	output( cfrvam )

restore

* attach the teacher-year file named in output(); every row of that file
* must match a row in memory
merge m:1 teachid year using cfrvam , assert( master match ) nogenerate

* NOTE(review): tv is presumably the value-added estimate written by vam --
* confirm against the vam documentation
rename tv tva

*-------------------------------------------------------------------------------
*	Class peer characteristics
*-------------------------------------------------------------------------------

* peer mean baseline score
* peer mean baseline score
* (mean of std_tm1 over all rows of the teacher-year, the student's own row
*  included; left missing where teachid is missing)
egen pmbs = mean( std_tm1 ) if !missing( teachid ) , by( teachid year )

* peer mean failed baseline
* (share of the teacher-year whose prior-year score was below the level-3
*  minimum, among rows with a prior-year score)
tsset mastid year
generate _fail_tm1 = ( L1.c23_full_scal < 0 ) if !missing( L1.c23_full_scal )
egen pmfb = mean( _fail_tm1 ) if !missing( teachid ) , by( teachid year )
drop _fail_tm1

*-------------------------------------------------------------------------------
*	Lags/leads
*-------------------------------------------------------------------------------

* panel: student x year, so L. / F. refer to adjacent calendar years
tsset mastid year

* next year's teacher value-added and class peer measures
generate tva_tp1  = F1.tva
generate pmbs_tp1 = F1.pmbs
generate pmfb_tp1 = F1.pmfb

* same tested grade as in the previous year; defined only for grades 4-8
* and only when a previous-year grade is observed
generate ret_in_tm1 = ( L1.test_grade == test_grade ) if !missing( L1.test_grade ) & inrange( test_grade , 4 , 8 )

*-------------------------------------------------------------------------------
*	LEAs retesting prior to 2009
*-------------------------------------------------------------------------------

* Build an LEA-level flag on a preserved copy, then merge it back.
preserve

* the data for pre-2009 retests is only available in 2008 for math
* students that could have been retested at the LEA's discretion:
* grades 3, 5, 8 with a centered score between -3 and -1
keep if year == 2008 & inlist( test_grade , 3 , 5 , 8 ) & inrange( c23_scal , -3 , -1 )

* students who were retested (an administration-1 standardized score exists)
generate retested = !missing( std1 )

* collapse to LEA-level data (share of these students who were retested)
collapse (mean) retested , by( lea )

* flag LEAs in which none of these students was retested
generate sem_lea = ( retested == 0 )
drop retested

tempfile lea_flags
save `lea_flags' , replace

restore

merge m:1 lea using `lea_flags' , nogenerate assert( master match )

* LEAs without any such student in 2008 get 0
replace sem_lea = 0 if missing( sem_lea )
		
*-------------------------------------------------------------------------------
*	Finish up
*-------------------------------------------------------------------------------

* shrink storage types where possible, then write the analysis file to the
* working directory
compress
save effort-evaluation-data , replace

* hand over to the analysis script
* NOTE(review): this path is hard-coded and is not covered by the "*@"
* placeholders at the top of the file -- adjust when running elsewhere
do "P:\NCERDC-Taylor\programs\evaluation-effort-analysis.do"
